4  Practical 3 / Pracical Day 4

4.1 Question 1:

4.1.1 Instructions:

Display the flights dataset in an alternative format to simply printing it (i.e. running flights).

4.1.2 Answers:

Code
tibble(flights)
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
Code
UtilsDataRSV::view_cols(flights)
[1] "year"
[1] 410
[1] "_____________________"
[1] "month"
[1] 11  8  4  1  6
[1] "_____________________"
[1] "day"
[1] 21 25 22 14 20
[1] "_____________________"
[1] "dep_time"
[1] 1342  707  710 2000   NA
[1] "_____________________"
[1] "sched_dep_time"
[1]  603  944 1848 1441  755
[1] "_____________________"
[1] "dep_delay"
[1]  53 911 103 261  NA
[1] "_____________________"
[1] "arr_time"
[1] 1506 2230 1907  139   NA
[1] "_____________________"
[1] "sched_arr_time"
[1]  720 1001  827 1523 1624
[1] "_____________________"
[1] "arr_delay"
[1]  81 123 367   4  NA
[1] "_____________________"
[1] "carrier"
 [1] "AS" "HA" "FL" "B6" "MQ" "EV" "DL" "OO" "AA" "YV" "WN" "9E" "UA" "US" "VX"
[16] "F9"
[1] "_____________________"
[1] "flight"
[1] 4692 4225 4422  199 3824
[1] "_____________________"
[1] "tailnum"
 [1] "N493UA" "N386AA" "N657UA" "N175DN" "N7CAAA" "N57111" "N903DA" "N3758Y"
 [9] "N811UA" "N600LR" "N724MQ" "N385DN" "N183UW" "N318AT" "N912DL" "N695DL"
[17] "N8886A" "N13718" "N7AYAA" NA      
[1] "4024 unique entries not displayed"
[1] "_____________________"
[1] "origin"
[1] "LGA" "JFK" "EWR"
[1] "_____________________"
[1] "dest"
 [1] "OKC" "SBN" "MYR" "BOS" "SFO" "PVD" "MEM" "BUF" "TUL" "SMF" "ROC" "CRW"
[13] "MCI" "HNL" "DSM" "CLE" "RSW" "MTJ" "DTW" "XNA"
[1] "85 unique entries not displayed"
[1] "_____________________"
[1] "air_time"
[1] 640 267  99 288  NA
[1] "_____________________"
[1] "distance"
[1] 2378 1726  416  500 1023
[1] "_____________________"
[1] "hour"
[1]  5  6 17 18 19
[1] "_____________________"
[1] "minute"
[1] 43 44 50 25 13
[1] "_____________________"
[1] "time_hour"
 [1] "2013-02-25 11:00:00 EST" "2013-06-06 09:00:00 EDT"
 [3] "2013-11-15 12:00:00 EST" "2013-05-01 10:00:00 EDT"
 [5] "2013-05-30 23:00:00 EDT" "2013-05-07 08:00:00 EDT"
 [7] "2013-07-06 11:00:00 EDT" "2013-06-24 17:00:00 EDT"
 [9] "2013-01-23 14:00:00 EST" "2013-01-18 07:00:00 EST"
[11] "2013-11-05 21:00:00 EST" "2013-01-19 06:00:00 EST"
[13] "2013-08-04 12:00:00 EDT" "2013-01-27 05:00:00 EST"
[15] "2013-02-03 17:00:00 EST" "2013-09-22 22:00:00 EDT"
[17] "2013-08-13 09:00:00 EDT" "2013-07-04 19:00:00 EDT"
[19] "2013-11-16 22:00:00 EST" "2013-11-05 20:00:00 EST"
[1] "6916 unique entries not displayed"
[1] "_____________________"
Warning: Not all unique entries displayed for these non-numeric cols: tailnum,
dest, time_hour

4.2 Question 2:

4.2.1 Instructions:

Rewrite this code using dplyr and the pipe:

Code
flight1 <- flights[flights$month == 1, ]
carrier_vec <- unique(flight1$carrier)
carrier_dist_vec_mean <- numeric(length(carrier_vec))
carrier_dist_vec_sd <- numeric(length(carrier_vec))
for (i in seq_along(carrier_vec)) {
  carrier_dist_vec_mean[i] <- mean(
    flight1$distance[flight1$carrier == carrier_vec[i]]
   )
  carrier_dist_vec_sd[i] <- sd(
    flight1$distance[flight1$carrier == carrier_vec[i]]
  )
}
dist_tbl <- tibble(
  carrier = carrier_vec,
  mean_distance = carrier_dist_vec_mean,
  sd_distance = carrier_dist_vec_sd
)
dist_tbl[order(dist_tbl$mean_distance), ]
# A tibble: 16 × 3
   carrier mean_distance sd_distance
   <chr>           <dbl>       <dbl>
 1 YV               229          0  
 2 9E               476.       334. 
 3 EV               522.       294. 
 4 US               536.       553. 
 5 MQ               566.       223. 
 6 FL               691.       142. 
 7 OO               733         NA  
 8 WN               942.       496. 
 9 B6              1062.       681. 
10 DL              1220.       644. 
11 AA              1350.       626. 
12 UA              1462.       778. 
13 F9              1620          0  
14 AS              2402          0  
15 VX              2495.        98.2
16 HA              4983          0  

4.2.2 Answers:

Code
dist_tbl2 <- flights |>
  
  filter(month == 1) |>
  
    group_by(carrier) |>  
  
      summarize(
        mean_distance = mean(distance),  
        sd_distance   = sd(distance)
        ) |>
  
          arrange(mean_distance)                      

dist_tbl2
# A tibble: 16 × 3
   carrier mean_distance sd_distance
   <chr>           <dbl>       <dbl>
 1 YV               229          0  
 2 9E               476.       334. 
 3 EV               522.       294. 
 4 US               536.       553. 
 5 MQ               566.       223. 
 6 FL               691.       142. 
 7 OO               733         NA  
 8 WN               942.       496. 
 9 B6              1062.       681. 
10 DL              1220.       644. 
11 AA              1350.       626. 
12 UA              1462.       778. 
13 F9              1620          0  
14 AS              2402          0  
15 VX              2495.        98.2
16 HA              4983          0  

4.3 Question 3:

4.3.1 Instructions:

Explain why the standard deviation is NA for one carrier, and why it is 0 for others. Demonstrate your answer using code.

4.3.2 Answer:

The standard deviation is N.A for the OO carrier only, this is because in order to calculate the standard deviation we need more than one observation (OO only has one observation distance in January).

Code
# NA Example using dummy-code: Standard deviation cannot be computed for a single observation

single_obs <- c(100000000000)

sd(single_obs)  
[1] NA
Code
# Analysing why we have NA:

OO_Jan_distances <- flights |>
  
  filter(month == 1, carrier == "OO") |>
  
    select(carrier, distance)

OO_Jan_distances
# A tibble: 1 × 2
  carrier distance
  <chr>      <dbl>
1 OO           733

The standard deviation is equal to zero for the YV carrier’s January distances, this is because there is no variation in the distances.

Code
# 0 Result: Standard deviation is 0 when all observations are identical

identical_obs <- c(1000000, 1000000, 1000000)

sd(identical_obs) 
[1] 0
Code
# Analysing why we have sd = 0

YV_Jan_distances <- flights |>
  
  filter(month == 1, carrier == "YV") |>
  
    select(carrier, distance)

YV_Jan_distances
# A tibble: 46 × 2
   carrier distance
   <chr>      <dbl>
 1 YV           229
 2 YV           229
 3 YV           229
 4 YV           229
 5 YV           229
 6 YV           229
 7 YV           229
 8 YV           229
 9 YV           229
10 YV           229
# ℹ 36 more rows

4.4 Question 4:

4.4.1 Instructions:

Using tidyr and dplyr where appropriate, construct a dataframe where the carriers are along the columns, and the rows are the average departure delay (dep_delay) flown by each carrier (carrier) in each month.

4.4.2 Answers:

Code
avg_dep_delay_df <- flights |>

    group_by(month, carrier) |>

      summarise(avg_dep_delay = mean(dep_delay), .groups = "drop") |>

       pivot_wider(names_from = carrier, values_from = avg_dep_delay)

 avg_dep_delay_df
# A tibble: 12 × 17
   month  `9E`    AA     AS    B6    DL    EV    F9    FL    HA    MQ    OO
   <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1    NA    NA  7.35     NA    NA    NA 10     NA   54.4     NA  67  
 2     2    NA    NA NA        NA    NA    NA NA     NA   17.4     NA  NA  
 3     3    NA    NA  8.42     NA    NA    NA 16.8   NA    1.16    NA  NA  
 4     4    NA    NA 11.3      NA    NA    NA 24.6   NA   -2.1     NA  NA  
 5     5    NA    NA  6.77     NA    NA    NA 35.9   NA   -1.45    NA  NA  
 6     6    NA    NA 13.1      NA    NA    NA 29.4   NA    1.47    NA  61  
 7     7    NA    NA  2.42     NA    NA    NA 31.8   NA   -1.71    NA  NA  
 8     8    NA    NA  2.87     NA    NA    NA 22.2   NA    1.68    NA  64  
 9     9    NA    NA -4.52     NA    NA    NA NA     NA   -5.44    NA  NA  
10    10    NA    NA  0.677    NA    NA    NA  9.70  NA   -5.10    NA  NA  
11    11    NA    NA  3.08     NA    NA    NA NA     16.9 -5.44    NA   0.8
12    12    NA    NA 18.0      NA    NA    NA 13.1   NA   -3.14    NA  NA  
# ℹ 5 more variables: UA <dbl>, US <dbl>, VX <dbl>, WN <dbl>, YV <dbl>

4.5 Question 5:

4.5.1 Instructions:

Calculate the proportion of flights that were delayed (dep_delay greater than 0) but arrived on or before time (arr_delay less than or equal to 0).

4.5.2 Answers:

Code
total_delayed <- flights |>
  filter(dep_delay > 0) |>
  nrow()

delayed_but_on_time <- flights |>
  filter(dep_delay > 0,  arr_delay <= 0) |>
  nrow()

prop_delayed_but_on_time <- (delayed_but_on_time)/(total_delayed)

prop_delayed_but_on_time
[1] 0.2759593

4.6 Question 6:

4.6.1 Instructions:

Using the airlines and flights datasets, do the following, showing the output from each step:

  • Identify routes that more than one airline flies

  • For each such route, calculate the average arrival delay for each airline (exclude NAs). Find the names of these airlines.

  • For each such route, identify the airline with the worst and best average arrival delay.

  • Identify the route with the greatest difference between the best and worst performing airlines

  • Determine the reason for this difference

4.6.2 Answers:

4.6.2.1 Identify Routes That More Than One Airline Flies

Code
routes_multiple_airlines <- flights |>

  group_by(origin, dest) |>

    summarise(num_airlines = n_distinct(carrier), .groups = "drop") |>

      filter(num_airlines > 1)

routes_multiple_airlines
# A tibble: 128 × 3
   origin dest  num_airlines
   <chr>  <chr>        <int>
 1 EWR    ATL              4
 2 EWR    AUS              2
 3 EWR    BDL              2
 4 EWR    BNA              2
 5 EWR    BOS              3
 6 EWR    BWI              2
 7 EWR    CHS              2
 8 EWR    CLE              2
 9 EWR    CLT              3
10 EWR    CVG              2
# ℹ 118 more rows

4.6.2.2 Calculate the Average Arrival Delay for Each Airline on These Routes

Code
route_airline_delays <- flights |>

  semi_join(routes_multiple_airlines, by = c("origin", "dest")) |>

    group_by(origin, dest, carrier) |>

      summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE), .groups = "drop") |>

        left_join(airlines, by = "carrier")

route_airline_delays
# A tibble: 343 × 5
   origin dest  carrier avg_arr_delay name                    
   <chr>  <chr> <chr>           <dbl> <chr>                   
 1 EWR    ATL   9E              -6.25 Endeavor Air Inc.       
 2 EWR    ATL   DL              10.0  Delta Air Lines Inc.    
 3 EWR    ATL   EV              19.5  ExpressJet Airlines Inc.
 4 EWR    ATL   UA              10.5  United Air Lines Inc.   
 5 EWR    AUS   UA               4.28 United Air Lines Inc.   
 6 EWR    AUS   WN             -11.2  Southwest Airlines Co.  
 7 EWR    BDL   EV               6.78 ExpressJet Airlines Inc.
 8 EWR    BDL   UA              22.6  United Air Lines Inc.   
 9 EWR    BNA   EV              17.7  ExpressJet Airlines Inc.
10 EWR    BNA   WN              -2.13 Southwest Airlines Co.  
# ℹ 333 more rows

4.6.2.3 Identify the Airline with the Worst and Best Average Arrival Delay for Each Route

Code
route_best_worst <- route_airline_delays |>

  group_by(origin, dest) |>

    summarise(

    best_airline = name[which.min(avg_arr_delay)],

    worst_airline = name[which.max(avg_arr_delay)],

    best_avg_delay = min(avg_arr_delay, na.rm = TRUE),

    worst_avg_delay = max(avg_arr_delay, na.rm = TRUE),

    .groups = "drop"

  )

route_best_worst
# A tibble: 128 × 6
   origin dest  best_airline        worst_airline best_avg_delay worst_avg_delay
   <chr>  <chr> <chr>               <chr>                  <dbl>           <dbl>
 1 EWR    ATL   Endeavor Air Inc.   ExpressJet A…         -6.25            19.5 
 2 EWR    AUS   Southwest Airlines… United Air L…        -11.2              4.28
 3 EWR    BDL   ExpressJet Airline… United Air L…          6.78            22.6 
 4 EWR    BNA   Southwest Airlines… ExpressJet A…         -2.13            17.7 
 5 EWR    BOS   ExpressJet Airline… JetBlue Airw…         -4.01             6.87
 6 EWR    BWI   Southwest Airlines… ExpressJet A…          5.95            20.1 
 7 EWR    CHS   United Air Lines I… ExpressJet A…        -14               16.2 
 8 EWR    CLE   ExpressJet Airline… United Air L…         -3.71             5.97
 9 EWR    CLT   US Airways Inc.     ExpressJet A…          0.920           20.5 
10 EWR    CVG   Endeavor Air Inc.   ExpressJet A…          1.40            21.2 
# ℹ 118 more rows

4.6.2.4 Identify the Route with the Greatest Difference Between the Best and Worst Performing Airlines

Code
route_with_max_diff <- route_best_worst |>

  mutate(delay_diff = worst_avg_delay - best_avg_delay) |>

    arrange(desc(delay_diff)) |>

      slice(1)

route_with_max_diff
# A tibble: 1 × 7
  origin dest  best_airline      worst_airline    best_avg_delay worst_avg_delay
  <chr>  <chr> <chr>             <chr>                     <dbl>           <dbl>
1 JFK    ATL   Endeavor Air Inc. ExpressJet Airl…           1.40             128
# ℹ 1 more variable: delay_diff <dbl>

4.6.2.5 Determine the Reason for This Difference

Code
investigation <- flights |>

  filter(origin == route_with_max_diff$origin,

         dest == route_with_max_diff$dest) |>

        group_by(carrier) |>

          summarise(

            n_flights = n(),
        
            avg_arr_delay = mean(arr_delay, na.rm = TRUE),
        
            avg_dep_delay = mean(dep_delay, na.rm = TRUE),
        
            .groups = "drop") |>

              left_join(airlines, by = "carrier")

investigation
# A tibble: 3 × 5
  carrier n_flights avg_arr_delay avg_dep_delay name                    
  <chr>       <int>         <dbl>         <dbl> <chr>                   
1 9E             55          1.40          1.47 Endeavor Air Inc.       
2 DL           1874          6.34         10.7  Delta Air Lines Inc.    
3 EV              1        128           124    ExpressJet Airlines Inc.

4.7 Question 7:

4.7.1 Instructions:

Identify all columns with missing entries, typos and any other inconsistencies in the dataset below (load it just by running the code; created using dput command, FYI):

4.7.2 Answers:

[1] "id"
 [1] "id_24" "id_19" "id_34" "id_37" "id_6"  "id_41" "id_17" "id_14" "id_32"
[10] "id_23" "id_35" "id_7"  "id_9"  "id_39" "id_50" "id_44" "id_1"  "id_10"
[19] "id_5"  "id_48"
[1] "30 unique entries not displayed"
[1] "_____________________"
[1] "age"
[1] 46 76 60 33 27
[1] "_____________________"
[1] "gender"
[1] "female" "male"   "femal" 
[1] "_____________________"
[1] "height"
[1] 165.4 157.7 157.4 197.4    NA
[1] "_____________________"
[1] "weight"
[1] 68.7 95.7 67.8 83.0 99.3
[1] "_____________________"
[1] "blood_type"
[1] "AB" "A"  "B"  "O" 
[1] "_____________________"
[1] "disease_status"
[1] "healthy"  "Healthy"  "diseased"
[1] "_____________________"
[1] "cholesterol"
[1] 199 189 235 196 250
[1] "_____________________"
[1] "glucose"
[1] 96 91 71 94 NA
[1] "_____________________"
[1] "smoker"
[1] "yes" "no" 
[1] "_____________________"
[1] "exercise"
[1] "none"       "occasional" "regular"   
[1] "_____________________"
[1] "income"
[1] 86851 68275 88295 57315 91326
[1] "_____________________"
[1] "education"
[1] "highschool" "PhD"        "bachelor"   "master"    
[1] "_____________________"
[1] "region"
[1] "East"  "North" "West"  "South"
[1] "_____________________"
[1] "marital_status"
[1] "single"   "widowed"  "divorced" "married" 
[1] "_____________________"
Warning: Not all unique entries displayed for these non-numeric cols: id
[1] "Missing values by column:"
# A tibble: 1 × 15
     id   age gender height weight blood_type disease_status cholesterol glucose
  <int> <int>  <int>  <int>  <int>      <int>          <int>       <int>   <int>
1     0     0      0      2      0          0              0           0       3
# ℹ 6 more variables: smoker <int>, exercise <int>, income <int>,
#   education <int>, region <int>, marital_status <int>
[1] "Gender counts:"
# A tibble: 3 × 2
  gender     n
  <chr>  <int>
1 femal      1
2 female    22
3 male      27
[1] "Disease status counts:"
# A tibble: 3 × 2
  disease_status     n
  <chr>          <int>
1 Healthy            1
2 diseased          19
3 healthy           30
[1] "Blood type counts:"
# A tibble: 4 × 2
  blood_type     n
  <chr>      <int>
1 A             14
2 AB             7
3 B             10
4 O             19
[1] "Smoker counts:"
# A tibble: 2 × 2
  smoker     n
  <chr>  <int>
1 no        23
2 yes       27
[1] "Exercise counts:"
# A tibble: 3 × 2
  exercise       n
  <chr>      <int>
1 none          16
2 occasional    19
3 regular       15
[1] "Education counts:"
# A tibble: 4 × 2
  education      n
  <chr>      <int>
1 PhD           12
2 bachelor      12
3 highschool    13
4 master        13
[1] "Region counts:"
# A tibble: 4 × 2
  region     n
  <chr>  <int>
1 East       9
2 North     16
3 South      9
4 West      16
[1] "Marital status counts:"
# A tibble: 4 × 2
  marital_status     n
  <chr>          <int>
1 divorced          15
2 married           12
3 single            11
4 widowed           12